{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![seq2seq_attention_intent_slot模型](img/model.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "'''\n",
    "对话中的意图识别和槽填充联合模型：\n",
    "这里实现了《Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling》中的另一个模型attention-based rNN model，如上图\n",
    "此模型利用birnn-attention实现：\n",
    "1.意图识别是利用encoder中的最后一个time step中的双向隐层，再加上平均池化或者利用attention加权平均，最后接一个fc层进行分类\n",
    "2.槽填充是序列标注，双向隐状态加attention权重，最后也是一个fc层分类。这里注意一点是，槽的每一步的预测输出会输入到birnn中的前向传输时间步中。\n",
    "3.总的loss = 意图识别loss + 槽填充loss\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from torchtext import data, datasets\n",
    "import pandas as pd\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_dir = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
    "atis_data = os.path.join(base_dir, 'atis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "build train and val dataset\n",
    "'''\n",
    "    \n",
    "tokenize = lambda s:s.split()\n",
    "\n",
    "SOURCE = data.Field(sequential=True, tokenize=tokenize,\n",
    "                    lower=True, use_vocab=True,\n",
    "                    init_token='<sos>', eos_token='<eos>',\n",
    "                    pad_token='<pad>', unk_token='<unk>',\n",
    "                    batch_first=True, fix_length=50,\n",
    "                    include_lengths=True) #include_lengths=True为方便之后使用torch的pack_padded_sequence\n",
    "\n",
    "TARGET = data.Field(sequential=True, tokenize=tokenize,\n",
    "                    lower=True, use_vocab=True,\n",
    "                    init_token='<sos>', eos_token='<eos>',\n",
    "                    pad_token='<pad>', unk_token='<unk>',\n",
    "                    batch_first=True, fix_length=50,\n",
    "                    include_lengths=True) #include_lengths=True为方便之后使用torch的pack_padded_sequence\n",
    "LABEL = data.Field(\n",
    "                sequential=False,\n",
    "                use_vocab=True)\n",
    "\n",
    "train, val = data.TabularDataset.splits(\n",
    "                                        path=atis_data,\n",
    "                                        skip_header=True,\n",
    "                                        train='atis.train.csv',\n",
    "                                        validation='atis.test.csv',\n",
    "                                        format='csv',\n",
    "                                        fields=[('index', None), ('intent', LABEL), ('source', SOURCE), ('target', TARGET)])\n",
    "\n",
    "SOURCE.build_vocab(train, val)\n",
    "TARGET.build_vocab(train, val)\n",
    "LABEL.build_vocab(train, val)\n",
    "\n",
    "train_iter, val_iter = data.Iterator.splits(\n",
    "                                            (train, val),\n",
    "                                            batch_sizes=(32, len(val)), # 训练集设置为64,验证集整个集合用于测试\n",
    "                                            shuffle=True,\n",
    "                                            sort_within_batch=True, #为true则一个batch内的数据会按sort_key规则降序排序\n",
    "                                            sort_key=lambda x: len(x.source)) #这里按src的长度降序排序，主要是为后面pack,pad操作)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SOURCE.vocab.stoi[SOURCE.pad_token]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save source words\n",
    "source_words_path = os.path.join(os.getcwd(), 'source_words.pkl')\n",
    "with open(source_words_path, 'wb') as f_source_words:\n",
    "    pickle.dump(SOURCE.vocab, f_source_words)\n",
    "\n",
    "# save target words\n",
    "target_words_path = os.path.join(os.getcwd(), 'target_words.pkl')\n",
    "with open(target_words_path, 'wb') as f_target_words:\n",
    "    pickle.dump(TARGET.vocab, f_target_words)\n",
    "    \n",
    "# save label words\n",
    "label_words_path = os.path.join(os.getcwd(), 'label_words.pkl')\n",
    "with open(label_words_path, 'wb') as f_label_words:\n",
    "    pickle.dump(LABEL.vocab, f_label_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "from torch.optim import lr_scheduler\n",
    "import math\n",
    "from apex import amp\n",
    "import time\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build model\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "\n",
    "# 构建attention权重计算方式\n",
    "class Attention(nn.Module):\n",
    "    def __init__(self, hidden_dim):\n",
    "        super(Attention, self).__init__()\n",
    "        self.attn = nn.Linear((hidden_dim * 2), hidden_dim)\n",
    "        self.v = nn.Linear(hidden_dim, 1, bias=False)\n",
    "\n",
    "    def concat_score(self, hidden, encoder_output):\n",
    "        seq_len = encoder_output.shape[1]\n",
    "        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1) # [batch_size, seq_len, hidden_size]\n",
    "        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_output),dim=2))) # [batch_size, seq_len, hidden_dim]\n",
    "        attention = self.v(energy).squeeze(2) #[batch_size, seq_len]\n",
    "        return attention #[batch_size, seq_len]\n",
    "\n",
    "    def forward(self, hidden, encoder_output):\n",
    "        # hidden = [batch_size, hidden_size]\n",
    "        # #encoder_output=[batch_size, seq_len, hidden_size]\n",
    "        \n",
    "        attn_energies = self.concat_score(hidden, encoder_output)\n",
    "\n",
    "        return F.softmax(attn_energies, dim=1).unsqueeze(1) #softmax归一化，[batch_size, 1, seq_len]\n",
    "    \n",
    "#构建模型\n",
    "class BirnnAttention(nn.Module):\n",
    "    def __init__(self, source_input_dim, source_emb_dim, hidden_dim, n_layers, dropout, pad_index, slot_output_size, intent_output_size, slot_embed_dim, predict_flag):\n",
    "        super(BirnnAttention, self).__init__()\n",
    "        self.pad_index = pad_index\n",
    "        self.hidden_dim = hidden_dim//2 # 双向lstm\n",
    "        self.n_layers = n_layers\n",
    "        self.slot_output_size = slot_output_size\n",
    "        # 是否预测模式\n",
    "        self.predict_flag = predict_flag\n",
    "        \n",
    "        self.source_embedding = nn.Embedding(source_input_dim, source_emb_dim, padding_idx=pad_index)\n",
    "        # 双向gru，隐层维度是hidden_dim\n",
    "        self.source_gru = nn.GRU(source_emb_dim, self.hidden_dim, n_layers, dropout=dropout, bidirectional=True, batch_first=True) #使用双向\n",
    "        \n",
    "        \n",
    "        # 单个cell的隐层维度与gru隐层维度一样，为hidden_dim\n",
    "        self.gru_cell = nn.GRUCell(slot_embed_dim + (2 * hidden_dim), hidden_dim)\n",
    "        self.attention = Attention(hidden_dim)\n",
    "        # 意图intent预测\n",
    "        self.intent_output = nn.Linear(hidden_dim * 2, intent_output_size)\n",
    "        # 槽slot预测\n",
    "        self.slot_output = nn.Linear(hidden_dim, slot_output_size)\n",
    "        self.slot_embedding = nn.Embedding(slot_output_size, slot_embed_dim)\n",
    "        \n",
    "    def forward(self, source_input, source_len):\n",
    "        '''\n",
    "        source_input:[batch_size, seq_len]\n",
    "        source_len:[batch_size]\n",
    "        '''\n",
    "        if self.predict_flag:\n",
    "            assert len(source_input) == 1, '预测时一次输入一句话'\n",
    "            seq_len = source_len[0]\n",
    "            \n",
    "            # 1.Encoder阶段，将输入的source进行编码\n",
    "            # source_embedded:[batch_size, seq_len, source_emb_dim]\n",
    "            source_embedded = self.source_embedding(source_input)\n",
    "            packed = torch.nn.utils.rnn.pack_padded_sequence(source_embedded, source_len, batch_first=True, enforce_sorted=True) #这里enfore_sotred=True要求数据根据词数排序\n",
    "            source_output, hidden = self.source_gru(packed)\n",
    "            # source_output=[batch_size, seq_len, 2 * self.hidden_size]，这里的2*self.hidden_size = hidden_dim\n",
    "            # hidden=[n_layers * 2, batch_size, self.hidden_size]\n",
    "            source_output, _ = torch.nn.utils.rnn.pad_packed_sequence(source_output, batch_first=True, padding_value=self.pad_index, total_length=len(source_input[0])) #这个会返回output以及压缩后的legnths\n",
    "            '''\n",
    "            source_hidden[-2,:,:]是gru最后一步的forward\n",
    "            source_hidden[-1,:,:]是gru最后一步的backward\n",
    "            '''\n",
    "            # source_hidden=[batch_size, 2*self.hidden_size]\n",
    "            source_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)\n",
    "            #保存注意力向量\n",
    "            attention_context = torch.zeros(1, seq_len, self.hidden_dim * 2)\n",
    "            output_tokens = []\n",
    "           \n",
    "            aligns = source_output.transpose(0,1) #对齐向量\n",
    "            \n",
    "            input = torch.tensor(2).unsqueeze(0)  # 预测阶段解码器输入第一个token-> <sos>\n",
    "            for s in range(1, seq_len):\n",
    "                aligned = aligns[s].unsqueeze(1)# [batch_size, 1, hidden_size*2]\n",
    "                 # embedded=[1, 1, slot_embed_dim]\n",
    "                slot_embedded = self.slot_embedding(input)\n",
    "                slot_embedded = slot_embedded.unsqueeze(0)\n",
    "                # 利用利用上一步的hidden与encoder_output，计算attention权重\n",
    "                # attention_weights=[batch_size, 1, seq_len]\n",
    "                attention_weights = self.attention(source_hidden, source_output)\n",
    "\n",
    "                '''\n",
    "                以下是计算上下文：利用attention权重与encoder_output计算attention上下文向量\n",
    "                注意力权重分布用于产生编码器隐藏状态的加权和，加权平均的过程。得到的向量称为上下文向量\n",
    "                '''\n",
    "                context = attention_weights.bmm(source_output)\n",
    "                attention_context[:,s,:] = context\n",
    "              \n",
    "                combined_grucell_input = torch.cat([aligned, slot_embedded, context], dim =2)\n",
    "             \n",
    "                source_hidden = self.gru_cell(combined_grucell_input.squeeze(1), source_hidden)\n",
    "              \n",
    "                slot_prediction = self.slot_output(source_hidden)\n",
    "            \n",
    "                input = slot_prediction.argmax(1)\n",
    "                output_token = input.squeeze().detach().item()\n",
    "               \n",
    "                output_tokens.append(output_token)\n",
    "            \n",
    "             #意图识别\n",
    "            #拼接注意力向量和encoder的输出\n",
    "            combined_attention_sourceoutput = torch.cat([attention_context, source_output], dim=2)\n",
    "            intent_outputs = self.intent_output(torch.mean(combined_attention_sourceoutput, dim = 1))\n",
    "            intent_outputs = intent_outputs.squeeze()\n",
    "            intent_outputs = intent_outputs.argmax()\n",
    "            return output_tokens, intent_outputs\n",
    "        \n",
    "        else:\n",
    "            # 1.Encoder阶段，将输入的source进行编码\n",
    "            # source_embedded:[batch_size, seq_len, source_emb_dim]\n",
    "            source_embedded = self.source_embedding(source_input)\n",
    "            packed = torch.nn.utils.rnn.pack_padded_sequence(source_embedded, source_len, batch_first=True, enforce_sorted=True) #这里enfore_sotred=True要求数据根据词数排序\n",
    "            source_output, hidden = self.source_gru(packed)\n",
    "            # source_output=[batch_size, seq_len, 2 * self.hidden_size]，这里的2*self.hidden_size = hidden_dim\n",
    "            # hidden=[n_layers * 2, batch_size, self.hidden_size]\n",
    "            source_output, _ = torch.nn.utils.rnn.pad_packed_sequence(source_output, batch_first=True, padding_value=self.pad_index, total_length=len(source_input[0])) #这个会返回output以及压缩后的legnths\n",
    "            '''\n",
    "            source_hidden[-2,:,:]是gru最后一步的forward\n",
    "            source_hidden[-1,:,:]是gru最后一步的backward\n",
    "            '''\n",
    "            # source_hidden=[batch_size, 2*self.hidden_size]\n",
    "            source_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)\n",
    "\n",
    "\n",
    "            # 2.Decoder阶段，预测slot与intent\n",
    "            batch_size = source_input.shape[0]\n",
    "            seq_len = source_input.shape[1]\n",
    "            # 保存slot的预测概率\n",
    "            slot_outputs = torch.zeros(batch_size, seq_len, self.slot_output_size).to(device)\n",
    "\n",
    "            #保存注意力向量\n",
    "            attention_context = torch.zeros(batch_size, seq_len, self.hidden_dim * 2).to(device)\n",
    "\n",
    "            # 每个batch数据的第一个字符<sos>对应的是index是2\n",
    "            input = torch.tensor(2).repeat(batch_size).to(device)\n",
    "            aligns = source_output.transpose(0,1) # 利用encoder output最后一层的每一个时间步\n",
    "            # 槽识别\n",
    "            for t in range(1, seq_len):\n",
    "                '''\n",
    "                解码器输入的初始hidden为encoder的最后一步的hidden\n",
    "                接收输出即predictions和新的hidden状态\n",
    "                '''\n",
    "                aligned = aligns[t].unsqueeze(1)# [batch_size, 1, hidden_size] # hidden_size包含前向和后向隐状态向量\n",
    "                input = input.unsqueeze(1)\n",
    "                # input=[batch_size, 1]\n",
    "                # hidden=[batch_size, hidden_size] 初始化为encoder的最后一层 [batch_size, hidden_size]\n",
    "                # encoder_output=[batch_size, seq_len, hidden_dim*2]\n",
    "                # aligned=[batch_size, 1, hidden_dim*2]\n",
    "\n",
    "                # embedded=[batch_sze, 1, slot_embed_dim]\n",
    "                slot_embedded = self.slot_embedding(input)\n",
    "\n",
    "                # 利用利用上一步的hidden与encoder_output，计算attention权重\n",
    "                # attention_weights=[batch_size, 1, seq_len]\n",
    "                attention_weights = self.attention(source_hidden, source_output)\n",
    "\n",
    "                '''\n",
    "                以下是计算上下文：利用attention权重与encoder_output计算attention上下文向量\n",
    "                注意力权重分布用于产生编码器隐藏状态的加权和，加权平均的过程。得到的向量称为上下文向量\n",
    "                '''\n",
    "                context = attention_weights.bmm(source_output) # [batch_size, 1, seq_len] * [batch_size, seq_len, hidden_dim]=[batch_size, 1, hidden_dim]\n",
    "                attention_context[:,t,:] = context.squeeze(1)\n",
    "                #combined_grucell_input=[batch_size, 1, (hidden_size + slot_embed_dim + hidden_dim)]\n",
    "                combined_grucell_input = torch.cat([aligned, slot_embedded, context], dim =2)\n",
    "                # [batch_size, hidden_dim]\n",
    "                source_hidden = self.gru_cell(combined_grucell_input.squeeze(1), source_hidden)\n",
    "                # 预测slot, [batch_size, slot_output_size]\n",
    "                slot_prediction = self.slot_output(source_hidden)\n",
    "                slot_outputs[:, t, :] = slot_prediction\n",
    "                # 获取预测的最大概率的token\n",
    "                input = slot_prediction.argmax(1)\n",
    "            #意图识别\n",
    "            #拼接注意力向量和encoder的输出，[batch_size, seq_len, hidden_dim * 2]\n",
    "            combined_attention_sourceoutput = torch.cat([attention_context, source_output], dim=2)\n",
    "            intent_outputs = self.intent_output(torch.mean(combined_attention_sourceoutput, dim = 1))\n",
    "\n",
    "            return slot_outputs, intent_outputs\n",
    "        \n",
    "\n",
    "# 构建模型，优化函数，损失函数，学习率衰减函数\n",
    "def build_model(source, target, label, source_emb_dim, hidden_dim, n_layers, dropout, slot_embed_dim, lr, gamma, weight_decay):\n",
    "    '''\n",
    "    训练seq2seq model\n",
    "    input与output的维度是字典的大小。\n",
    "    encoder与decoder的embedding与dropout可以不同\n",
    "    网络的层数与hiden/cell状态的size必须相同\n",
    "    '''\n",
    "    input_dim = len(source.vocab) # source 词典大小（即词数量）\n",
    "    output_dim = len(target.vocab) # target 词典大小（即实体类型数量）\n",
    "    label_dim = len(label.vocab) # label 词典大小（即意图类别数量）\n",
    "    \n",
    "    model = BirnnAttention(input_dim, source_emb_dim, hidden_dim, n_layers, dropout, source.vocab.stoi[source.pad_token], output_dim, label_dim, slot_embed_dim, False).to(device)\n",
    "   \n",
    "    model.apply(init_weights)\n",
    "\n",
    "    # 定义优化函数\n",
    "    optimizer = optim.Adam(model.parameters(), lr=lr) #, weight_decay=weight_decay)\n",
    "    #optimizer = torch.optim.SGD(model.parameters(), lr=lr) #, momentum=0.9, nesterov=True)\n",
    "    # 定义lr衰减\n",
    "    #scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=gamma)\n",
    "    \n",
    "    '''\n",
    "        当网络的评价指标不在提升的时候，可以通过降低网络的学习率来提高网络性能:\n",
    "        optimer指的是网络的优化器\n",
    "        mode (str) ，可选择‘min’或者‘max’，min表示当监控量停止下降的时候，学习率将减小，max表示当监控量停止上升的时候，学习率将减小。默认值为‘min’\n",
    "        factor 学习率每次降低多少，new_lr = old_lr * factor\n",
    "        patience=10，容忍网路的性能不提升的次数，高于这个次数就降低学习率\n",
    "        verbose（bool） - 如果为True，则为每次更新向stdout输出一条消息。 默认值：False\n",
    "        threshold（float） - 测量新最佳值的阈值，仅关注重大变化。 默认值：1e-4\n",
    "        cooldown(int)： 冷却时间“，当调整学习率之后，让学习率调整策略冷静一下，让模型再训练一段时间，再重启监测模式。\n",
    "        min_lr(float or list):学习率下限，可为 float，或者 list，当有多个参数组时，可用 list 进行设置。\n",
    "        eps(float):学习率衰减的最小值，当学习率变化小于 eps 时，则不调整学习率。\n",
    "    '''\n",
    "    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.1, patience=2, verbose=False)\n",
    "    # 这里忽略<pad>的损失。\n",
    "    target_pad_index = target.vocab.stoi[source.pad_token]\n",
    "    # 定义损失函数(实体识别)\n",
    "    loss_slot = nn.CrossEntropyLoss(ignore_index=target_pad_index)\n",
    "    # 定义损失函数(意图识别)\n",
    "    loss_intent = nn.CrossEntropyLoss()\n",
    "    \n",
    "    return model, optimizer, scheduler, loss_slot, loss_intent\n",
    "\n",
    "\n",
    "# 训练\n",
    "def train(model, iterator, optimizer, loss_slot, loss_intent, clip):\n",
    "    '''\n",
    "    开始训练：\n",
    "        1.得到source与target句子\n",
    "        2.上一批batch的计算梯度归0\n",
    "        3.给模型喂source与target，并得到输出output\n",
    "        4.由于损失函数只适用于带有1维target和2维的input，我们需要用view进行flatten(在计算损失时，从output与target中忽略了第一列<sos>)\n",
    "        5.反向传播计算梯度loss.backward()\n",
    "        6.梯度裁剪，防止梯度爆炸\n",
    "        7.更新模型参数\n",
    "        8.损失值求和(返回所有batch的损失的均值)\n",
    "    '''\n",
    "    model.train()\n",
    "    epoch_loss = 0\n",
    "\n",
    "    for i, batch in enumerate(iterator):\n",
    "        src, src_lens = batch.source  # src=[batch_size, seq_len]，这里batch.src返回src和src的长度，因为在使用torchtext.Field时设置include_lengths=True\n",
    "        trg, _ = batch.target  # trg=[batch_size, seq_len]\n",
    "        label = batch.intent # [batch_size]\n",
    "        src = src.to(device)\n",
    "        trg = trg.to(device)\n",
    "        label = label.to(device)\n",
    "        \n",
    "        #slot_outputs=[batch_size, trg_len, trg_vocab_size], intetn_outputs=[batch_size, intent_size]\n",
    "        slot_outputs, intent_outputs = model(src, src_lens)\n",
    "        \n",
    "        # 以下在计算损失时，忽略了每个tensor的第一个元素及<sos>\n",
    "        output_dim = slot_outputs.shape[-1]\n",
    "        slot_outputs = slot_outputs[:, 1:, :].reshape(-1, output_dim)  # output=[batch_size * (seq_len - 1), output_dim]\n",
    "        trg = trg[:, 1:].reshape(-1)  # trg=[batch_size * (seq_len - 1)]\n",
    "        loss1 = loss_slot(slot_outputs, trg)\n",
    "        loss2 = loss_intent(intent_outputs, label)\n",
    "        loss = loss1 + loss2\n",
    "        \n",
    "        with amp.scale_loss(loss, optimizer) as scaled_loss:\n",
    "            scaled_loss.backward()\n",
    "        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
    "        optimizer.step()\n",
    "        optimizer.zero_grad()\n",
    "        epoch_loss += float(loss.item())\n",
    "        # print('epoch_loss:{}'.format(float(loss.item())))\n",
    "    return epoch_loss / len(iterator)\n",
    "\n",
    "'''\n",
    "评估\n",
    "'''\n",
    "def evaluate(model, iterator, loss_slot, loss_intent):\n",
    "    model.eval()  # 评估模型，切断dropout与batchnorm\n",
    "    epoch_loss = 0\n",
    "    with torch.no_grad():  # 不更新梯度\n",
    "        for i, batch in enumerate(iterator):\n",
    "            src, src_len = batch.source  # src=[batch_size, seq_len]\n",
    "            trg, _ = batch.target  # trg=[batch_size, seq_len]\n",
    "            label = batch.intent\n",
    "            src = src.to(device)\n",
    "            trg = trg.to(device)\n",
    "            label = label.to(device)\n",
    "            # output=[batch_size, seq_len, output_dim]\n",
    "            slot_outputs, intent_outputs = model(src, src_len)\n",
    "\n",
    "            output_dim = slot_outputs.shape[-1]\n",
    "            slot_outputs = slot_outputs[:, 1:, :].reshape(-1, output_dim)  # output=[batch_size * (seq_len - 1), output_dim]\n",
    "            trg = trg[:, 1:].reshape(-1)  # trg=[batch_size * (seq_len - 1)]\n",
    "            loss1 = loss_slot(slot_outputs, trg)\n",
    "            loss2 = loss_intent(intent_outputs, label)\n",
    "            loss = loss1 + loss2\n",
    "            epoch_loss += float(loss.item())\n",
    "    return epoch_loss / len(iterator)\n",
    "\n",
    "\n",
    "def train_model(model, train_iterator, val_iterator, optimizer, scheduler, loss_slot, loss_intent, n_epochs, clip, model_path, writer):\n",
    "    '''\n",
    "    开始训练我们的模型：\n",
    "    1.每一次epoch，都会检查模型是否达到的最佳的validation loss，如果达到了，就更新\n",
    "    最好的validation loss以及保存模型参数\n",
    "    2.打印每个epoch的loss以及困惑度。\n",
    "    '''\n",
    "    best_valid_loss = float('inf')\n",
    "    for epoch in range(n_epochs):\n",
    "        start_time = time.time()\n",
    "        train_loss = train(model, train_iterator, optimizer, loss_slot, loss_intent, clip)\n",
    "        writer.add_scalar('loss',train_loss,global_step=epoch+1)\n",
    "        \n",
    "        valid_loss = evaluate(model, val_iterator, loss_slot, loss_intent)\n",
    "        end_time = time.time()\n",
    "        epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
    "\n",
    "        if valid_loss < best_valid_loss:\n",
    "            best_valid_loss = valid_loss\n",
    "            torch.save(model.state_dict(), model_path)\n",
    "        scheduler.step(valid_loss)\n",
    "        print('epoch:{},time-mins:{},time-secs:{}'.format(epoch + 1, epoch_mins, epoch_secs))\n",
    "        print('train loss:{},train perplexity:{}'.format(train_loss, math.exp(train_loss)))\n",
    "        print('val loss:{}, val perplexity:{}'.format(valid_loss, math.exp(valid_loss)))\n",
    "    writer.flush()\n",
    "    writer.close()\n",
    "\n",
    "    #每个epoch所花时间\n",
    "def epoch_time(start_time, end_time):\n",
    "    run_tim = end_time - start_time\n",
    "    run_mins = int(run_tim / 60)\n",
    "    run_secs = int(run_tim-(run_mins * 60))\n",
    "    return run_mins,run_secs\n",
    "\n",
    "#对所有模块和子模块进行权重初始化\n",
    "def init_weights(model):\n",
    "    for name,param in model.named_parameters():\n",
    "        nn.init.uniform_(param.data, -0.08, 0.08)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\lib\\site-packages\\torch\\nn\\modules\\rnn.py:50: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.5 and num_layers=1\n",
      "  \"num_layers={}\".format(dropout, num_layers))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.\n",
      "\n",
      "Defaults for this optimization level are:\n",
      "enabled                : True\n",
      "opt_level              : O1\n",
      "cast_model_type        : None\n",
      "patch_torch_functions  : True\n",
      "keep_batchnorm_fp32    : None\n",
      "master_weights         : None\n",
      "loss_scale             : dynamic\n",
      "Processing user overrides (additional kwargs that are not None)...\n",
      "After processing overrides, optimization options are:\n",
      "enabled                : True\n",
      "opt_level              : O1\n",
      "cast_model_type        : None\n",
      "patch_torch_functions  : True\n",
      "keep_batchnorm_fp32    : None\n",
      "master_weights         : None\n",
      "loss_scale             : dynamic\n",
      "Warning:  multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback.  Original ImportError was: ModuleNotFoundError(\"No module named 'amp_C'\",)\n",
      "epoch:1,time-mins:0,time-secs:43\n",
      "train loss:1.278574818219894,train perplexity:3.591517511073177\n",
      "val loss:0.768112301826477, val perplexity:2.1556931126125054\n",
      "epoch:2,time-mins:0,time-secs:43\n",
      "train loss:0.29190145552349395,train perplexity:1.3389710629352432\n",
      "val loss:0.563666582107544, val perplexity:1.7571032670244615\n",
      "epoch:3,time-mins:0,time-secs:43\n",
      "train loss:0.18844216558127067,train perplexity:1.2073672535840592\n",
      "val loss:0.43828073143959045, val perplexity:1.5500399913039704\n",
      "epoch:4,time-mins:0,time-secs:43\n",
      "train loss:0.1275235006096176,train perplexity:1.1360115648917746\n",
      "val loss:0.5251220464706421, val perplexity:1.6906651755057944\n",
      "epoch:5,time-mins:0,time-secs:43\n",
      "train loss:0.0948597255270355,train perplexity:1.0995046118782075\n",
      "val loss:0.454354465007782, val perplexity:1.5751562362986666\n",
      "epoch:6,time-mins:0,time-secs:43\n",
      "train loss:0.07198656744395311,train perplexity:1.074640908792625\n",
      "val loss:0.47257575392723083, val perplexity:1.6041206966914077\n",
      "epoch:7,time-mins:0,time-secs:43\n",
      "train loss:0.03791102629232531,train perplexity:1.038638817220774\n",
      "val loss:0.44630342721939087, val perplexity:1.562525507381591\n",
      "epoch:8,time-mins:0,time-secs:43\n",
      "train loss:0.017891304857002046,train perplexity:1.0180523130338734\n",
      "val loss:0.44639068841934204, val perplexity:1.562661861181431\n",
      "epoch:9,time-mins:0,time-secs:43\n",
      "train loss:0.01226190540369433,train perplexity:1.0123373907817579\n",
      "val loss:0.44741660356521606, val perplexity:1.5642658422862685\n",
      "epoch:10,time-mins:0,time-secs:43\n",
      "train loss:0.009299443398888867,train perplexity:1.0093428175702788\n",
      "val loss:0.44786572456359863, val perplexity:1.5649685447104873\n"
     ]
    }
   ],
   "source": [
    "from torch.utils.tensorboard import SummaryWriter\n",
    "writer = SummaryWriter(os.getcwd()+'/log', comment='intent_slot')\n",
    "\n",
    "\n",
    "source_emb_dim = 64\n",
    "slot_embed_dim = 64\n",
    "hidden_dim = 128\n",
    "n_layers = 1\n",
    "dropout = 0.5\n",
    "lr = 0.01\n",
    "gamma = 0.1\n",
    "weight_decay = 0.1\n",
    "n_epochs = 10\n",
    "clip = 5.0\n",
    "model_path = os.path.join(os.getcwd(), \"model.h5\")\n",
    "\n",
    "\n",
    "model, optimizer, scheduler, loss_slot, loss_intent = build_model(SOURCE,\n",
    "                                                                  TARGET,\n",
    "                                                                  LABEL,\n",
    "                                                                  source_emb_dim,\n",
    "                                                                  hidden_dim,\n",
    "                                                                  n_layers,\n",
    "                                                                  dropout,\n",
    "                                                                  slot_embed_dim,\n",
    "                                                                  lr,\n",
    "                                                                  gamma,\n",
    "                                                                  weight_decay)\n",
    "\n",
    "model, optimizer = amp.initialize(model, optimizer, opt_level='O1')\n",
    "\n",
    "train_model(model,\n",
    "            train_iter,\n",
    "            val_iter,\n",
    "            optimizer,\n",
    "            scheduler,\n",
    "            loss_slot, \n",
    "            loss_intent,\n",
    "            n_epochs,\n",
    "            clip,\n",
    "            model_path,\n",
    "            writer)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
